This is an R Markdown Notebook. When you execute code within the notebook, the results appear beneath the code.
Try executing this chunk by clicking the Run button within the chunk or by placing your cursor inside it and pressing Cmd+Shift+Enter.
spotify <- read.csv("Popular_Spotify_Songs.csv")
head(spotify)
Add a new chunk by clicking the Insert Chunk button on the toolbar or by pressing Cmd+Option+I.
When you save the notebook, an HTML file containing the code and output will be saved alongside it (click the Preview button or press Cmd+Shift+K to preview the HTML file).
The preview shows you a rendered HTML copy of the contents of the editor. Consequently, unlike Knit, Preview does not run any R code chunks. Instead, the output of the chunk when it was last run in the editor is displayed.
spotify <- read.csv("Popular_Spotify_Songs.csv")
head(spotify)
# Convert the count columns that read.csv() imported as character.
# NOTE(review): the coercion warning and the missing large counts suggest that
# some entries are written with thousands separators (e.g. "1,234") - confirm
# against the CSV. Commas are stripped first so such entries are not silently
# turned into NA; anything that is still non-numeric becomes NA as before.
spotify$streams <- as.numeric(spotify$streams)
spotify$in_deezer_playlists <- as.numeric(
  gsub(",", "", spotify$in_deezer_playlists, fixed = TRUE)
)
Warning: NAs introduced by coercion
# Strip thousands separators before converting: as.numeric() alone turns any
# entry written like "1,021" into NA (with a coercion warning).
spotify$in_shazam_charts <- as.numeric(
  gsub(",", "", spotify$in_shazam_charts, fixed = TRUE)
)
Warning: NAs introduced by coercion
str(spotify[, 3:14])
'data.frame': 953 obs. of 12 variables:
$ artist_count : int 2 1 1 1 1 2 2 1 1 2 ...
$ released_year : int 2023 2023 2023 2019 2023 2023 2023 2023 2023 2023 ...
$ released_month : int 7 3 6 8 5 6 3 7 5 3 ...
$ released_day : int 14 23 30 23 18 1 16 7 15 17 ...
$ in_spotify_playlists: int 553 1474 1397 7858 3133 2186 3090 714 1096 2953 ...
$ in_spotify_charts : int 147 48 113 100 50 91 50 43 83 44 ...
$ streams : chr "141381703" "133716286" "140003974" "800840817" ...
$ in_apple_playlists : int 43 48 94 116 84 67 34 25 60 49 ...
$ in_apple_charts : int 263 126 207 207 133 213 222 89 210 110 ...
$ in_deezer_playlists : num 45 58 91 125 87 88 43 30 48 66 ...
$ in_deezer_charts : int 10 14 14 12 15 17 13 13 11 13 ...
$ in_shazam_charts : num 826 382 949 548 425 946 418 194 953 339 ...
# pairs() rejects non-numeric input, so keep only the numeric columns of the
# 3:14 slice; any column still stored as character is left out of the plot
# instead of aborting it.
pairs(
  Filter(is.numeric, spotify[, 3:14]),
  main = "Linear Relationships Between Metrics"
)
Error in pairs.default(spotify[, 3:14], main = "Linear Relationships Between Metrics") :
  non-numeric argument to 'pairs'
summary(spotify)
track_name artist.s._name artist_count released_year released_month released_day in_spotify_playlists in_spotify_charts
Length:953 Length:953 Min. :1.000 Min. :1930 Min. : 1.000 Min. : 1.00 Min. : 31 Min. : 0.00
Class :character Class :character 1st Qu.:1.000 1st Qu.:2020 1st Qu.: 3.000 1st Qu.: 6.00 1st Qu.: 875 1st Qu.: 0.00
Mode :character Mode :character Median :1.000 Median :2022 Median : 6.000 Median :13.00 Median : 2224 Median : 3.00
Mean :1.556 Mean :2018 Mean : 6.034 Mean :13.93 Mean : 5200 Mean : 12.01
3rd Qu.:2.000 3rd Qu.:2022 3rd Qu.: 9.000 3rd Qu.:22.00 3rd Qu.: 5542 3rd Qu.: 16.00
Max. :8.000 Max. :2023 Max. :12.000 Max. :31.00 Max. :52898 Max. :147.00
streams in_apple_playlists in_apple_charts in_deezer_playlists in_deezer_charts in_shazam_charts bpm key
Min. :2.762e+03 Min. : 0.00 Min. : 0.00 Min. : 0.0 Min. : 0.000 Min. : 0.00 Min. : 65.0 Length:953
1st Qu.:1.416e+08 1st Qu.: 13.00 1st Qu.: 7.00 1st Qu.: 12.0 1st Qu.: 0.000 1st Qu.: 0.00 1st Qu.:100.0 Class :character
Median :2.905e+08 Median : 34.00 Median : 38.00 Median : 36.5 Median : 0.000 Median : 2.00 Median :121.0 Mode :character
Mean :5.141e+08 Mean : 67.81 Mean : 51.91 Mean :109.7 Mean : 2.666 Mean : 51.18 Mean :122.5
3rd Qu.:6.739e+08 3rd Qu.: 88.00 3rd Qu.: 87.00 3rd Qu.:110.0 3rd Qu.: 2.000 3rd Qu.: 36.00 3rd Qu.:140.0
Max. :3.704e+09 Max. :672.00 Max. :275.00 Max. :974.0 Max. :58.000 Max. :953.00 Max. :206.0
NA's :1 NA's :79 NA's :57
mode danceability_. valence_. energy_. acousticness_. instrumentalness_. liveness_. speechiness_.
Length:953 Min. :23.00 Min. : 4.00 Min. : 9.00 Min. : 0.00 Min. : 0.000 Min. : 3.00 Min. : 2.00
Class :character 1st Qu.:57.00 1st Qu.:32.00 1st Qu.:53.00 1st Qu.: 6.00 1st Qu.: 0.000 1st Qu.:10.00 1st Qu.: 4.00
Mode :character Median :69.00 Median :51.00 Median :66.00 Median :18.00 Median : 0.000 Median :12.00 Median : 6.00
Mean :66.97 Mean :51.43 Mean :64.28 Mean :27.06 Mean : 1.581 Mean :18.21 Mean :10.13
3rd Qu.:78.00 3rd Qu.:70.00 3rd Qu.:77.00 3rd Qu.:43.00 3rd Qu.: 0.000 3rd Qu.:24.00 3rd Qu.:11.00
Max. :96.00 Max. :97.00 Max. :97.00 Max. :97.00 Max. :91.000 Max. :97.00 Max. :64.00
colSums(is.na(spotify))
track_name artist.s._name artist_count released_year released_month released_day in_spotify_playlists
0 0 0 0 0 0 0
in_spotify_charts streams in_apple_playlists in_apple_charts in_deezer_playlists in_deezer_charts in_shazam_charts
0 1 0 0 79 0 57
bpm key mode danceability_. valence_. energy_. acousticness_.
0 0 0 0 0 0 0
instrumentalness_. liveness_. speechiness_.
0 0 0
dim(spotify)
[1] 953 24
library(ggplot2)
# Histogram of release years, one bin per calendar year.
# The title and x label name the plotted variable explicitly; the former
# names(spotify$released_year) evaluates to NULL, which removed the x label.
ggplot(spotify, aes(x = released_year)) +
  geom_histogram(binwidth = 1, fill = "skyblue", color = "white") +
  labs(title = "Distribution of Released Year", x = "Released Year")
plot(density(spotify$released_year, na.rm = TRUE), main = "Density Plot of Released Year", xlab = "Released Year", col = "blue", lwd = 2)
View(spotify)
# Scatter of streams against Spotify playlist count, coloured by 'mode'
ggplot(
  data = spotify,
  mapping = aes(x = streams, y = in_spotify_playlists, color = mode)
) +
  geom_point() +
  labs(
    title = "Streams vs Playlist Metrics by Mode",
    x = "Streams",
    y = "Number in Spotify Playlists"
  ) +
  theme_minimal()
library(shiny)
library(ggplot2)
# UI: a checkbox group for the musical mode(s) next to the scatter plot
ui <- fluidPage(
  titlePanel("Streams vs Spotify Playlists by Mode"),
  sidebarLayout(
    sidebarPanel = sidebarPanel(
      checkboxGroupInput(
        inputId = "mode_select",
        label = "Select Mode(s):",
        choices = unique(spotify$mode),
        selected = unique(spotify$mode)
      )
    ),
    mainPanel = mainPanel(
      plotOutput(outputId = "scatterPlot")
    )
  )
)
# Server: redraws the scatter plot for the modes currently ticked in the UI
server <- function(input, output) {
  output$scatterPlot <- renderPlot({
    # Keep only the rows whose mode is among the selected checkboxes
    keep_rows <- spotify$mode %in% input$mode_select
    mode_subset <- spotify[keep_rows, ]

    ggplot(
      data = mode_subset,
      mapping = aes(x = streams, y = in_spotify_playlists, color = mode)
    ) +
      geom_point() +
      labs(
        title = "Streams vs Playlist Metrics by Mode",
        x = "Streams",
        y = "Number in Spotify Playlists"
      ) +
      theme_minimal()
  })
}
# Run the app
shinyApp(ui = ui, server = server)
Listening on http://127.0.0.1:7921
library(dplyr)
# Build a "Song - Artist" label for every track
spotify <- spotify %>%
  mutate(song.artist = paste(track_name, artist.s._name, sep = " - "))

# Most-streamed track of each release year (rows tied for first are all kept)
yearly_top_song <- spotify %>%
  group_by(released_year) %>%
  slice_max(streams, n = 1, with_ties = TRUE) %>%
  ungroup()
# Step 1: Keep the 10 most-streamed rows among the per-year winners
top10_yearly <- yearly_top_song %>%
  arrange(desc(streams)) %>%
  slice_head(n = 10)
top10_yearly

# Turn song.artist into a factor whose levels follow descending streams,
# so legends and fills appear in that order
top10_yearly <- top10_yearly %>%
  arrange(desc(streams)) %>%
  mutate(
    song.artist = factor(song.artist, levels = unique(song.artist))
  )
top10_yearly
# Bar chart of the top yearly songs: release year on the x-axis, one fill
# colour per "Song - Artist" label. The axis and legend titles describe the
# variables actually mapped (they were previously swapped).
ggplot(
  top10_yearly,
  aes(x = released_year, y = streams, fill = factor(song.artist))
) +
  geom_bar(stat = "identity") +
  labs(
    title = "Top Streamed Songs per Year",
    x = "Release Year",
    y = "Number of Streams",
    fill = "Song - Artist"
  ) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
# Same top-10 data with the year treated as a discrete axis category
ggplot(
  data = top10_yearly,
  mapping = aes(x = factor(released_year), y = streams, fill = song.artist)
) +
  geom_bar(stat = "identity") +
  labs(
    title = "Top 10 Streamed Songs By Year",
    x = "Year",
    y = "Number of Streams",
    fill = "Song-Artist"
  ) +
  theme_minimal() +
  theme(axis.text.x = element_text(hjust = 1, angle = 45))
# Five most-streamed tracks of every release year (ties at the cut-off kept)
yearly_top_songs <- spotify %>%
  group_by(released_year) %>%
  slice_max(streams, n = 5, with_ties = TRUE) %>%
  ungroup()
View(yearly_top_songs)
# Restrict the per-year top-5 table to tracks released in 2023
top_2023 <- filter(yearly_top_songs, released_year == 2023)

# Bar chart of those tracks, ordered from most to least streamed
ggplot(
  data = top_2023,
  mapping = aes(
    x = reorder(track_name, -streams),
    y = streams,
    fill = song.artist
  )
) +
  geom_bar(stat = "identity") +
  labs(
    title = "Top 5 Streamed Songs in 2023",
    x = "Song",
    y = "Number of Streams",
    fill = "Song & Artist"
  ) +
  theme_minimal() +
  theme(axis.text.x = element_text(hjust = 1, angle = 45))
# Check how many rows are in the dataset for 2022
dim(yearly_top_songs[yearly_top_songs$released_year == 2022, ])
[1] 5 25
# Check for NA or invalid values in 2022
summary(yearly_top_songs[yearly_top_songs$released_year == 2022, ])
track_name artist.s._name artist_count released_year released_month released_day in_spotify_playlists in_spotify_charts
Length:5 Length:5 Min. :1.0 Min. :2022 Min. :3.0 Min. : 6.0 Min. : 8506 Min. : 42.0
Class :character Class :character 1st Qu.:1.0 1st Qu.:2022 1st Qu.:5.0 1st Qu.: 6.0 1st Qu.: 8576 1st Qu.: 42.0
Mode :character Mode :character Median :2.0 Median :2022 Median :5.0 Median : 6.0 Median : 8870 Median : 43.0
Mean :1.6 Mean :2022 Mean :5.8 Mean :14.2 Mean :11713 Mean : 60.4
3rd Qu.:2.0 3rd Qu.:2022 3rd Qu.:7.0 3rd Qu.:22.0 3rd Qu.: 9037 3rd Qu.: 45.0
Max. :2.0 Max. :2022 Max. :9.0 Max. :31.0 Max. :23575 Max. :130.0
streams in_apple_playlists in_apple_charts in_deezer_playlists in_deezer_charts in_shazam_charts bpm key
Min. :1.231e+09 Min. : 94.0 Min. : 65.0 Min. :139.0 Min. :14.0 Min. : 49.0 Min. : 92.0 Length:5
1st Qu.:1.264e+09 1st Qu.:104.0 1st Qu.:108.0 1st Qu.:141.0 1st Qu.:14.0 1st Qu.:127.8 1st Qu.:107.0 Class :character
Median :1.357e+09 Median :124.0 Median :120.0 Median :164.0 Median :26.0 Median :160.0 Median :128.0 Mode :character
Mean :1.561e+09 Mean :188.2 Mean :124.8 Mean :327.6 Mean :25.2 Mean :136.2 Mean :126.4
3rd Qu.:1.441e+09 3rd Qu.:216.0 3rd Qu.:133.0 3rd Qu.:331.0 3rd Qu.:26.0 3rd Qu.:168.5 3rd Qu.:131.0
Max. :2.513e+09 Max. :403.0 Max. :198.0 Max. :863.0 Max. :46.0 Max. :176.0 Max. :174.0
NA's :1
mode danceability_. valence_. energy_. acousticness_. instrumentalness_. liveness_. speechiness_. song.artist
Length:5 Min. :52.0 Min. :19.0 Min. :47.0 Min. : 1 Min. :0.0 Min. : 9.0 Min. : 4.0 Length:5
Class :character 1st Qu.:62.0 1st Qu.:24.0 1st Qu.:71.0 1st Qu.: 1 1st Qu.:0.0 1st Qu.:13.0 1st Qu.: 6.0 Class :character
Mode :character Median :65.0 Median :43.0 Median :72.0 Median : 9 Median :0.0 Median :23.0 Median : 8.0 Mode :character
Mean :68.2 Mean :41.4 Mean :68.2 Mean :11 Mean :0.6 Mean :20.6 Mean :10.4
3rd Qu.:71.0 3rd Qu.:55.0 3rd Qu.:73.0 3rd Qu.:10 3rd Qu.:0.0 3rd Qu.:27.0 3rd Qu.: 9.0
Max. :91.0 Max. :66.0 Max. :78.0 Max. :34 Max. :3.0 Max. :31.0 Max. :25.0
# Alternatively, print it to inspect
print(yearly_top_songs[yearly_top_songs$released_year == 2022, ])
# Look at 2022 data closely
spotify %>%
filter(released_year == 2022) %>%
select(track_name, artist.s._name, streams) %>%
glimpse()
Rows: 402
Columns: 3
$ track_name <chr> "As It Was", "Kill Bill", "Calm Down (with Selena Gomez)", "Creepin'", "Anti-Hero", "I'm Good (Blue)", "I Ain't Worried", "La Ba…
$ artist.s._name <chr> "Harry Styles", "SZA", "R��ma, Selena G", "The Weeknd, 21 Savage, Metro Boomin", "Taylor Swift", "Bebe Rexha, David Guetta", "On…
$ streams <dbl> 2513188493, 1163093654, 899183384, 843957510, 999748277, 1109433169, 1085685420, 1214083358, 720434240, 674072710, 404562836, 37…
# Tie check: how many rows does each 2022 track contribute to the top list?
count(filter(yearly_top_songs, released_year == 2022), track_name)

# Missing-value check on the stream count and the label columns for 2022
yearly_top_songs %>%
  filter(released_year == 2022) %>%
  summarise(
    missing_streams = sum(is.na(streams)),
    missing_track = sum(is.na(track_name)),
    missing_artist = sum(is.na(song.artist))
  )
# Shiny app to view top streamed songs by year with a toggle
library(shiny)
library(dplyr)
library(ggplot2)
# UI: a year drop-down next to the bar chart of that year's top songs
ui <- fluidPage(
  titlePanel("Top Streamed Songs by Year"),
  sidebarLayout(
    sidebarPanel = sidebarPanel(
      selectInput(
        inputId = "year",
        label = "Select Year:",
        choices = sort(unique(yearly_top_songs$released_year))
      )
    ),
    mainPanel = mainPanel(
      plotOutput(outputId = "topSongsPlot")
    )
  )
)
# Server: draws the top-songs bar chart for the year chosen in the drop-down
server <- function(input, output, session) {
  output$topSongsPlot <- renderPlot({
    # Rows of the selected year; invalid UTF-8 bytes in the track names are
    # replaced with "*" before the names are used as axis labels
    year_rows <- yearly_top_songs %>%
      filter(released_year == input$year) %>%
      mutate(
        track_name = iconv(track_name, from = "UTF-8", to = "UTF-8", sub = "*")
      )

    ggplot(
      data = year_rows,
      mapping = aes(
        x = reorder(track_name, -streams),
        y = streams,
        fill = song.artist
      )
    ) +
      geom_bar(stat = "identity") +
      labs(
        title = paste("Top 5 Streamed Songs in", input$year),
        x = "Song",
        y = "Number of Streams",
        fill = "Song & Artist"
      ) +
      theme_minimal() +
      theme(axis.text.x = element_text(hjust = 1, angle = 45))
  })
}
# Run app
shinyApp(ui = ui, server = server)
Listening on http://127.0.0.1:7921
#view(top_2023)
names(top_2023)
[1] "track_name" "artist.s._name" "artist_count" "released_year" "released_month" "released_day"
[7] "in_spotify_playlists" "in_spotify_charts" "streams" "in_apple_playlists" "in_apple_charts" "in_deezer_playlists"
[13] "in_deezer_charts" "in_shazam_charts" "bpm" "key" "mode" "danceability_."
[19] "valence_." "energy_." "acousticness_." "instrumentalness_." "liveness_." "speechiness_."
[25] "song.artist"
# Load packages explicitly
library(dplyr)
library(tidyr)
# Long-format table of audio features for the 2023 top songs: one row per
# (track, feature) pair. dplyr::select() is namespaced to avoid masking.
top_2023_features <- top_2023 %>%
  dplyr::select(
    dplyr::all_of(c(
      "track_name",
      "bpm",
      "danceability_.",
      "speechiness_.",
      "energy_.",
      "acousticness_."
    ))
  ) %>%
  pivot_longer(
    cols = !track_name,
    names_to = "feature",
    values_to = "value"
  )
# Step 2: Dodged bars per feature, wrapped into polar coordinates
ggplot(
  data = top_2023_features,
  mapping = aes(x = feature, y = value, fill = track_name)
) +
  geom_bar(position = "dodge", stat = "identity") +
  coord_polar() +
  labs(
    title = "Audio Feature Metrics for Top 5 Songs in 2023",
    x = "",
    y = "",
    fill = "Track Name"
  ) +
  theme_minimal() +
  theme(
    axis.text.x = element_text(size = 12, face = "bold"),
    axis.text.y = element_blank(),
    axis.ticks = element_blank(),
    panel.grid = element_blank()
  )
# Three most-streamed tracks of every release year (ties at the cut-off kept)
yearly_top3_songs <- spotify %>%
  group_by(released_year) %>%
  slice_max(streams, n = 3, with_ties = TRUE) %>%
  ungroup()
#View(yearly_top3_songs)

# Restrict the per-year top-3 table to tracks released in 2023
top3_2023 <- filter(yearly_top3_songs, released_year == 2023)
# Long-format table of audio features for the 2023 top-3 songs: one row per
# (track, feature) pair. dplyr::select() is namespaced to avoid masking.
top3_2023_features <- top3_2023 %>%
  dplyr::select(
    dplyr::all_of(c(
      "track_name",
      "bpm",
      "danceability_.",
      "speechiness_.",
      "energy_.",
      "acousticness_."
    ))
  ) %>%
  pivot_longer(
    cols = !track_name,
    names_to = "feature",
    values_to = "value"
  )
# Step 2: Circular barplot of the audio features of the 2023 top-3 songs.
# The title now matches the data being plotted (top 3, not top 5).
ggplot(top3_2023_features, aes(x = feature, y = value, fill = track_name)) +
  geom_bar(stat = "identity", position = "dodge") +
  coord_polar() +
  labs(
    title = "Audio Feature Metrics for Top 3 Songs in 2023",
    x = "",
    y = "",
    fill = "Track Name"
  ) +
  theme_minimal() +
  theme(
    axis.text.y = element_blank(),
    axis.ticks = element_blank(),
    panel.grid = element_blank(),
    axis.text.x = element_text(size = 12, face = "bold")
  )
# Load clean libraries (force reloading if needed)
library(dplyr)
library(tibble)
library(tidyr)
library(fmsb)
library(scales)
G2;H2;Warningh: package ‘scales’ was built under R version 4.3.3g
G3;
Attaching package: ‘scales’
gG3;The following object is masked from ‘package:purrr’:
discard
gG3;The following object is masked from ‘package:readr’:
col_factor
g
# Give the audio-feature columns plain names (backticks needed for the
# originals because they end in "_.")
top_2023_clean <- dplyr::rename(
  top_2023,
  danceability = `danceability_.`,
  speechiness = `speechiness_.`,
  energy = `energy_.`,
  acousticness = `acousticness_.`
)

# Keep the track name plus the five metrics shown on the radar chart
radar_data <- top_2023_clean %>%
  dplyr::select(
    track_name, bpm, danceability, speechiness, energy, acousticness
  )

# Rescale every numeric metric to [0, 1] across the selected songs
radar_data_norm <- radar_data %>%
  mutate(
    across(
      where(is.numeric) & !track_name,
      function(metric) scales::rescale(metric, to = c(0, 1))
    )
  )
# Create max and min rows for required radar structure.
# Two one-row data frames are stacked: a row named "Max" with every metric set
# to 1, followed by a row named "Min" with every metric set to 0. These match
# the [0, 1] range the metrics were rescaled to.
max_min <- data.frame(
bpm = 1, danceability = 1, speechiness = 1, energy = 1, acousticness = 1,
row.names = c("Max")
) %>%
bind_rows(data.frame(
bpm = 0, danceability = 0, speechiness = 0, energy = 0, acousticness = 0,
row.names = c("Min")
))
# Add the song data with rownames as song titles.
# track_name is moved into the row names, so the result holds only the five
# metric columns: the Max/Min rows first, then one row per song.
# NOTE(review): the legend below reads the song titles back from
# rownames(radar_matrix) - this relies on bind_rows() keeping row names; verify.
radar_matrix <- bind_rows(
max_min,
radar_data_norm %>% column_to_rownames("track_name")
)
# One border colour per song (the first two rows are the Max/Min bounds),
# plus a translucent version of each colour for the polygon fill
colors_border <- rainbow(nrow(radar_matrix) - 2)
colors_in <- adjustcolor(colors_border, alpha.f = 0.25)

# Radar chart of the normalised metrics
fmsb::radarchart(
  radar_matrix,
  title = "Top 5 Songs in 2023 — Audio Features Radar Chart",
  axistype = 1,
  # polygon appearance
  pcol = colors_border,
  pfcol = colors_in,
  plwd = 2,
  plty = 1,
  # grid appearance
  cglcol = "grey",
  cglty = 1,
  cglwd = 0.8,
  # axis and variable labels
  axislabcol = "grey",
  caxislabels = seq(0, 1, 0.2),
  vlcex = 0.9
)

# Legend: song titles are the row names after the two bound rows
legend(
  "topright",
  legend = rownames(radar_matrix)[-c(1, 2)],
  col = colors_border,
  pch = 20,
  bty = "n",
  text.col = "black",
  cex = 0.8
)
# Give the audio-feature columns plain names (backticks needed for the
# originals because they end in "_.")
top3_2023_clean <- dplyr::rename(
  top3_2023,
  danceability = `danceability_.`,
  speechiness = `speechiness_.`,
  energy = `energy_.`,
  acousticness = `acousticness_.`
)

# Keep the track name plus the five metrics shown on the radar chart
radar_data3 <- top3_2023_clean %>%
  dplyr::select(
    track_name, bpm, danceability, speechiness, energy, acousticness
  )

# Rescale every numeric metric to [0, 1] across the selected songs
radar_data_norm3 <- radar_data3 %>%
  mutate(
    across(
      where(is.numeric) & !track_name,
      function(metric) scales::rescale(metric, to = c(0, 1))
    )
  )
# Create max and min rows for required radar structure.
# Two one-row data frames are stacked: a row named "Max" with every metric set
# to 1, followed by a row named "Min" with every metric set to 0. These match
# the [0, 1] range the metrics were rescaled to.
max_min3 <- data.frame(
bpm = 1, danceability = 1, speechiness = 1, energy = 1, acousticness = 1,
row.names = c("Max")
) %>%
bind_rows(data.frame(
bpm = 0, danceability = 0, speechiness = 0, energy = 0, acousticness = 0,
row.names = c("Min")
))
# Add the song data with rownames as song titles.
# track_name is moved into the row names, so the result holds only the five
# metric columns: the Max/Min rows first, then one row per top-3 song.
# NOTE(review): the legend below reads the song titles back from
# rownames(radar_matrix3) - this relies on bind_rows() keeping row names; verify.
radar_matrix3 <- bind_rows(
max_min3,
radar_data_norm3 %>% column_to_rownames("track_name")
)
# Assign colors per song.
# The palette is sized from radar_matrix3 (the top-3 matrix being plotted);
# it was previously sized from the top-5 matrix, giving the wrong number of
# colours. The first two rows are the Max/Min bounds, hence the "- 2".
colors_border <- rainbow(nrow(radar_matrix3) - 2)
colors_in <- adjustcolor(colors_border, alpha.f = 0.25)

# Plot the normalised metrics of the top-3 songs as a radar chart
fmsb::radarchart(
  radar_matrix3,
  axistype = 1,
  pcol = colors_border,
  pfcol = colors_in,
  plwd = 2,
  plty = 1,
  cglcol = "grey",
  cglty = 1,
  axislabcol = "grey",
  caxislabels = seq(0, 1, 0.2),
  cglwd = 0.8,
  vlcex = 0.9,
  title = "Top 3 Songs in 2023 — Audio Features Radar Chart"
)

# Legend: song titles are the row names after the two bound rows
legend(
  "topright",
  legend = rownames(radar_matrix3)[-c(1, 2)],
  bty = "n",
  pch = 20,
  col = colors_border,
  text.col = "black",
  cex = 0.8
)
# Keep only the numeric columns of the data set
numeric_cols <- dplyr::select(spotify, where(is.numeric))
numeric_cols

# Pairwise correlation matrix of all numeric columns, computed on the rows
# that have no missing value in any of them; the 'streams' row is pulled out
# of this matrix afterwards
correlations <- cor(x = numeric_cols, use = "complete.obs")
correlations
artist_count released_year released_month released_day in_spotify_playlists in_spotify_charts streams in_apple_playlists
artist_count 1.000000000 0.061445644 0.009720347 -0.044766245 -0.0746868039 -0.002421656 -0.1090468634 -0.008712241
released_year 0.061445644 1.000000000 0.031372926 0.160042169 -0.3305741123 0.100988420 -0.1483509269 -0.155648773
released_month 0.009720347 0.031372926 1.000000000 -0.015820232 -0.0187633639 -0.031526077 0.0413240023 0.007380967
released_day -0.044766245 0.160042169 -0.015820232 1.000000000 -0.0320967227 0.042203010 0.0410748362 0.028622345
in_spotify_playlists -0.074686804 -0.330574112 -0.018763364 -0.032096723 1.0000000000 0.173307807 0.7650951338 0.709922084
in_spotify_charts -0.002421656 0.100988420 -0.031526077 0.042203010 0.1733078066 1.000000000 0.2454749140 0.213322335
streams -0.109046863 -0.148350927 0.041324002 0.041074836 0.7650951338 0.245474914 1.0000000000 0.663657168
in_apple_playlists -0.008712241 -0.155648773 0.007380967 0.028622345 0.7099220838 0.213322335 0.6636571679 1.000000000
in_apple_charts -0.079655066 0.007650642 -0.010603129 0.009855360 0.2087053786 0.565321488 0.2508103705 0.322358302
in_deezer_playlists -0.073406225 -0.265234545 -0.035433532 -0.041980554 0.7880546875 0.151785663 0.7185929567 0.645914528
in_deezer_charts 0.022218537 0.103287112 -0.001921194 0.063555630 0.1952907401 0.558419963 0.2594696320 0.409688235
in_shazam_charts -0.031812269 0.054492378 -0.090799317 0.040728906 0.1111503800 0.594678886 0.0587456970 0.187401561
bpm -0.067047448 -0.041957657 -0.051936323 -0.048020996 0.0260085534 0.028010413 0.0327164251 0.044415122
danceability_. 0.209581804 0.192054100 -0.034978955 0.076211130 -0.1066197808 0.075249362 -0.0754316227 0.011504320
valence_. 0.120784211 -0.064812792 -0.118074232 0.071279071 -0.0552336199 0.056602171 -0.0584550791 0.053187299
energy_. 0.149966302 0.130105474 -0.081977712 0.064572106 -0.0494256700 0.104328458 -0.0496657926 0.074416649
acousticness_. -0.101620287 -0.169751059 0.039266560 -0.010279631 0.0001543819 -0.078095007 0.0013286969 -0.088265650
instrumentalness_. -0.052814944 -0.014754771 0.031122232 0.007126726 0.0121080272 -0.012565007 -0.0009670221 -0.045488723
liveness_. 0.041035230 0.007441171 -0.017825352 0.002619619 -0.0339739648 -0.039153639 -0.0387277529 -0.046255149
speechiness_. 0.117955768 0.126711891 0.030599526 -0.017347379 -0.0719087372 -0.086192083 -0.0907281501 -0.101941835
in_apple_charts in_deezer_playlists in_deezer_charts in_shazam_charts bpm danceability_. valence_. energy_. acousticness_.
artist_count -0.079655066 -0.073406225 0.022218537 -0.031812269 -0.0670474482 0.209581804 0.120784211 0.149966302 -0.1016202867
released_year 0.007650642 -0.265234545 0.103287112 0.054492378 -0.0419576571 0.192054100 -0.064812792 0.130105474 -0.1697510593
released_month -0.010603129 -0.035433532 -0.001921194 -0.090799317 -0.0519363227 -0.034978955 -0.118074232 -0.081977712 0.0392665600
released_day 0.009855360 -0.041980554 0.063555630 0.040728906 -0.0480209963 0.076211130 0.071279071 0.064572106 -0.0102796313
in_spotify_playlists 0.208705379 0.788054688 0.195290740 0.111150380 0.0260085534 -0.106619781 -0.055233620 -0.049425670 0.0001543819
in_spotify_charts 0.565321488 0.151785663 0.558419963 0.594678886 0.0280104129 0.075249362 0.056602171 0.104328458 -0.0780950070
streams 0.250810371 0.718592957 0.259469632 0.058745697 0.0327164251 -0.075431623 -0.058455079 -0.049665793 0.0013286969
in_apple_playlists 0.322358302 0.645914528 0.409688235 0.187401561 0.0444151222 0.011504320 0.053187299 0.074416649 -0.0882656498
in_apple_charts 1.000000000 0.198692411 0.356675982 0.443346418 0.0512089175 -0.003976097 0.061427394 0.153590558 -0.1050831002
in_deezer_playlists 0.198692411 1.000000000 0.218281108 0.135919298 0.0453831408 -0.104850821 -0.025849620 -0.028605485 0.0288379089
in_deezer_charts 0.356675982 0.218281108 1.000000000 0.374829138 0.0370517105 0.087187954 0.075155386 0.108571701 -0.0439176997
in_shazam_charts 0.443346418 0.135919298 0.374829138 1.000000000 0.0891578410 -0.010179394 -0.003080391 0.095095549 -0.0716735649
bpm 0.051208918 0.045383141 0.037051710 0.089157841 1.0000000000 -0.140710959 0.050484657 0.003536259 -0.0020473755
danceability_. -0.003976097 -0.104850821 0.087187954 -0.010179394 -0.1407109592 1.000000000 0.390335848 0.186243358 -0.2390078796
valence_. 0.061427394 -0.025849620 0.075155386 -0.003080391 0.0504846571 0.390335848 1.000000000 0.354253808 -0.0680708838
energy_. 0.153590558 -0.028605485 0.108571701 0.095095549 0.0035362587 0.186243358 0.354253808 1.000000000 -0.5547718398
acousticness_. -0.105083100 0.028837909 -0.043917700 -0.071673565 -0.0020473755 -0.239007880 -0.068070884 -0.554771840 1.0000000000
instrumentalness_. -0.010658818 0.021617457 -0.002299823 -0.015732282 -0.0009552758 -0.098154216 -0.136058212 -0.032914831 0.0332206982
liveness_. -0.001551996 -0.005142997 0.002914949 -0.045209630 0.0005645641 -0.093272303 0.016319569 0.120967010 -0.0406689579
speechiness_. -0.157645853 -0.108361699 -0.073955127 -0.081685578 0.0247134810 0.173420342 0.036580343 -0.017125796 -0.0238770164
instrumentalness_. liveness_. speechiness_.
artist_count -0.0528149443 0.0410352297 0.11795577
released_year -0.0147547713 0.0074411712 0.12671189
released_month 0.0311222324 -0.0178253521 0.03059953
released_day 0.0071267258 0.0026196188 -0.01734738
in_spotify_playlists 0.0121080272 -0.0339739648 -0.07190874
in_spotify_charts -0.0125650073 -0.0391536392 -0.08619208
streams -0.0009670221 -0.0387277529 -0.09072815
in_apple_playlists -0.0454887232 -0.0462551494 -0.10194183
in_apple_charts -0.0106588177 -0.0015519961 -0.15764585
in_deezer_playlists 0.0216174566 -0.0051429975 -0.10836170
in_deezer_charts -0.0022998235 0.0029149486 -0.07395513
in_shazam_charts -0.0157322822 -0.0452096305 -0.08168558
bpm -0.0009552758 0.0005645641 0.02471348
danceability_. -0.0981542162 -0.0932723026 0.17342034
valence_. -0.1360582123 0.0163195694 0.03658034
energy_. -0.0329148310 0.1209670100 -0.01712580
acousticness_. 0.0332206982 -0.0406689579 -0.02387702
instrumentalness_. 1.0000000000 -0.0488636800 -0.08664221
liveness_. -0.0488636800 1.0000000000 -0.04518074
speechiness_. -0.0866422067 -0.0451807367 1.00000000
# Extract just the correlations with 'streams'
cor_with_streams <- correlations["streams", ]
cor_with_streams
artist_count released_year released_month released_day in_spotify_playlists in_spotify_charts streams
-0.1090468634 -0.1483509269 0.0413240023 0.0410748362 0.7650951338 0.2454749140 1.0000000000
in_apple_playlists in_apple_charts in_deezer_playlists in_deezer_charts in_shazam_charts bpm danceability_.
0.6636571679 0.2508103705 0.7185929567 0.2594696320 0.0587456970 0.0327164251 -0.0754316227
valence_. energy_. acousticness_. instrumentalness_. liveness_. speechiness_.
-0.0584550791 -0.0496657926 0.0013286969 -0.0009670221 -0.0387277529 -0.0907281501
# Sort and view
sort(cor_with_streams, decreasing = TRUE)
streams in_spotify_playlists in_deezer_playlists in_apple_playlists in_deezer_charts in_apple_charts in_spotify_charts
1.0000000000 0.7650951338 0.7185929567 0.6636571679 0.2594696320 0.2508103705 0.2454749140
in_shazam_charts released_month released_day bpm acousticness_. instrumentalness_. liveness_.
0.0587456970 0.0413240023 0.0410748362 0.0327164251 0.0013286969 -0.0009670221 -0.0387277529
energy_. valence_. danceability_. speechiness_. artist_count released_year
-0.0496657926 -0.0584550791 -0.0754316227 -0.0907281501 -0.1090468634 -0.1483509269
# Linear model of streams on the three playlist counts plus three audio
# features
model <- lm(
  streams ~ in_spotify_playlists + in_deezer_playlists + in_apple_playlists +
    danceability_. + energy_. + valence_.,
  data = spotify
)
summary(model)
Call:
lm(formula = streams ~ in_spotify_playlists + in_deezer_playlists +
in_apple_playlists + danceability_. + energy_. + valence_.,
data = spotify)
Residuals:
Min 1Q Median 3Q Max
-1.068e+09 -1.246e+08 -3.334e+07 9.771e+07 1.301e+09
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 182491364 45594165 4.003 6.80e-05 ***
in_spotify_playlists 42130 3557 11.844 < 2e-16 ***
in_deezer_playlists 566032 74514 7.596 7.90e-14 ***
in_apple_playlists 1507560 189228 7.967 5.11e-15 ***
danceability_. 223598 592024 0.378 0.7058
energy_. -717839 509189 -1.410 0.1590
valence_. -637129 379394 -1.679 0.0934 .
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 229500000 on 866 degrees of freedom
(80 observations deleted due to missingness)
Multiple R-squared: 0.6886, Adjusted R-squared: 0.6864
F-statistic: 319.2 on 6 and 866 DF, p-value: < 2.2e-16
# Refit keeping only the three playlist-count predictors, the ones reported
# as significant in the previous summary
refined_model <- lm(
  streams ~ in_spotify_playlists + in_deezer_playlists + in_apple_playlists,
  data = spotify
)

# Coefficients and fit statistics of the reduced model
summary(refined_model)
Call:
lm(formula = streams ~ in_spotify_playlists + in_deezer_playlists +
in_apple_playlists, data = spotify)
Residuals:
Min 1Q Median 3Q Max
-1.082e+09 -1.173e+08 -3.519e+07 9.715e+07 1.311e+09
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 119309688 10537578 11.322 < 2e-16 ***
in_spotify_playlists 43264 3526 12.271 < 2e-16 ***
in_deezer_playlists 565426 74598 7.580 8.88e-14 ***
in_apple_playlists 1427508 186160 7.668 4.67e-14 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 2.3e+08 on 869 degrees of freedom
(80 observations deleted due to missingness)
Multiple R-squared: 0.686, Adjusted R-squared: 0.6849
F-statistic: 632.9 on 3 and 869 DF, p-value: < 2.2e-16
# Refit on rows that have no missing value in the response or the predictors
data_complete <- na.omit(
  select(
    spotify,
    streams, in_spotify_playlists, in_deezer_playlists, in_apple_playlists
  )
)
model <- lm(
  streams ~ in_spotify_playlists + in_deezer_playlists + in_apple_playlists,
  data = data_complete
)

# Predict streams for those same rows and store the predictions alongside
# the observed values
predicted_streams <- predict(model, newdata = data_complete)
data_complete$predicted_streams <- predicted_streams
data_complete
# Observed streams against model predictions; the dashed red line is y = x
ggplot(
  data = data_complete,
  mapping = aes(x = streams, y = predicted_streams)
) +
  geom_point(color = "steelblue", alpha = 0.6) +
  geom_abline(slope = 1, intercept = 0, linetype = "dashed", color = "red") +
  labs(
    title = "Actual vs Predicted Streams",
    x = "Actual Streams",
    y = "Predicted Streams"
  ) +
  theme_minimal()
# Plot residuals
# `residuals` is kept as a standalone vector for any later use.
residuals <- model$residuals
View(data_complete)

# Pair each residual with its prediction in one data frame, so the plot reads
# both aesthetics from its own data instead of picking `residuals` up from the
# global environment (where it only works while it stays row-aligned).
residual_data <- data.frame(
  predicted_streams = data_complete$predicted_streams,
  residuals = residuals
)
ggplot(residual_data, aes(x = predicted_streams, y = residuals)) +
  geom_point(alpha = 0.6, color = "darkorange") +
  geom_hline(yintercept = 0, linetype = "dashed", color = "red") +
  labs(
    title = "Residual Plot",
    x = "Predicted Streams",
    y = "Residuals"
  ) +
  theme_minimal()
# Step 1: Load required libraries
library(caret)
# Step 2: Set seed for reproducibility
# (must run before train() so the fold assignment is the same on every run)
set.seed(123)
# Step 3: Define training control for 10-fold cross-validation
train_control <- trainControl(method = "cv", number = 10)
# Step 4: Define the model formula (same predictors as before)
# Only the three playlist counts are used; the extra predicted_streams column
# in data_complete is not referenced by the formula.
model_formula <- streams ~ in_spotify_playlists + in_deezer_playlists + in_apple_playlists
# Step 5: Fit the linear regression model using caret::train()
# cv_model is reused further down for single-row predictions and by the
# prediction Shiny app.
cv_model <- train(
model_formula,
data = data_complete,
method = "lm",
trControl = train_control
)
# Step 6: Review cross-validation results
# Prints the resampled RMSE, R-squared and MAE.
print(cv_model)
Linear Regression
873 samples
3 predictor
No pre-processing
Resampling: Cross-Validated (10 fold)
Summary of sample sizes: 785, 786, 785, 786, 786, 787, ...
Resampling results:
RMSE Rsquared MAE
231655068 0.6849981 161789134
Tuning parameter 'intercept' was held constant at a value of TRUE
# Optional: Plot predictions vs. actuals again using cv_model$finalModel if desired
# Fit the final model on full data
# Same formula and data as cv_model. No trControl is passed here, so caret's
# default resampling settings apply to this call rather than the 10-fold setup.
final_model <- train(
streams ~ in_spotify_playlists + in_deezer_playlists + in_apple_playlists,
data = data_complete,
method = "lm"
)
# View final coefficients
# finalModel is the underlying fitted model stored inside the train object.
coef(final_model$finalModel)
(Intercept) in_spotify_playlists in_deezer_playlists in_apple_playlists
119309687.75 43263.74 565425.67 1427508.47
# One hypothetical song, described by the three predictors of the model.
# Replace these numbers with your actual input values.
new_input <- data.frame(
  in_spotify_playlists = 2500,
  in_deezer_playlists  = 50,
  in_apple_playlists   = 250
)

# Predicted streams for that song from the cross-validated caret model.
# NOTE(review): this reuses the name `predicted_streams`, which earlier in the
# notebook is the vector assigned to data_complete$predicted_streams.
predicted_streams <- predict(cv_model, new_input)

# Show the prediction
predicted_streams
1
612617449
# Load required packages
library(shiny)

# UI: three numeric inputs (playlist counts per platform), a button that
# triggers the prediction, and a text area that shows the result.
ui <- fluidPage(
  titlePanel("Predict Song Streams"),
  sidebarLayout(
    sidebarPanel(
      numericInput("spotify", "Spotify Playlists:", value = 5000, min = 0),
      numericInput("deezer", "Deezer Playlists:", value = 1000, min = 0),
      numericInput("apple", "Apple Playlists:", value = 2000, min = 0),
      actionButton("predict", "Predict Streams")
    ),
    mainPanel(
      h3("Predicted Streams:"),
      verbatimTextOutput("prediction")
    )
  )
)

# Server: predicts streams with `cv_model` (taken from the notebook's global
# environment) each time the "Predict Streams" button is pressed.
server <- function(input, output) {
  # The prediction is a reactive value recomputed only on button clicks.
  # Outputs are defined once at the top level of the server function rather
  # than being re-assigned inside an observer on every click.
  predicted <- eventReactive(input$predict, {
    # An emptied numeric box yields NA; stop quietly instead of passing NA
    # to predict(). A value of 0 is still accepted.
    req(input$spotify, input$deezer, input$apple)

    new_input <- data.frame(
      in_spotify_playlists = input$spotify,
      in_deezer_playlists = input$deezer,
      in_apple_playlists = input$apple
    )
    predict(cv_model, newdata = new_input)
  })

  # Rounded to whole streams and printed with thousands separators.
  output$prediction <- renderText({
    format(round(predicted(), 0), big.mark = ",")
  })
}

# Run the application
shinyApp(ui = ui, server = server)
Listening on http://127.0.0.1:7921
# `?cv_model` asks the help system for a topic named "cv_model". That is the
# name of an object in this session, not a documented topic, so the lookup
# only reports that no documentation exists. The object is created by
# caret::train(), so that is the help page to open instead.
?caret::train

# Print the fitted train object itself.
cv_model
Linear Regression
873 samples
3 predictor
No pre-processing
Resampling: Cross-Validated (10 fold)
Summary of sample sizes: 785, 786, 785, 786, 786, 787, ...
Resampling results:
RMSE Rsquared MAE
231655068 0.6849981 161789134
Tuning parameter 'intercept' was held constant at a value of TRUE
# Count the missing values in each column of data_complete (is.na() marks the
# missing cells, colSums() totals them per column).
colSums(is.na(data_complete))
streams in_spotify_playlists in_deezer_playlists in_apple_playlists predicted_streams
0 0 0 0 0
# cv_model is a caret `train` object; the linear model it wraps is stored in
# $finalModel. That inner model is what predict() is given below, together
# with the `interval` argument.
lm_model <- cv_model$finalModel

# Predictions for every row of data_complete, once per interval type. Both
# results are indexed below by the columns "fit", "lwr" and "upr".
pred_conf <- predict(lm_model, newdata = data_complete, interval = "confidence")
pred_pred <- predict(lm_model, newdata = data_complete, interval = "prediction")

# data_complete plus the fitted value and both sets of interval bounds.
plot_data <- mutate(
  data_complete,
  predicted_streams = pred_conf[, "fit"],
  conf_low = pred_conf[, "lwr"],
  conf_high = pred_conf[, "upr"],
  pred_low = pred_pred[, "lwr"],
  pred_high = pred_pred[, "upr"]
)

# Predicted (y) against actual (x) streams, with the identity line and both
# interval bands drawn as translucent ribbons.
ggplot(data = plot_data, mapping = aes(x = streams, y = predicted_streams)) +
  geom_point(color = "darkblue", alpha = 0.6) +
  geom_abline(slope = 1, intercept = 0, color = "blue", linetype = "dashed") +
  geom_ribbon(
    aes(ymin = conf_low, ymax = conf_high),
    alpha = 0.3, fill = "lightblue"
  ) +
  geom_ribbon(
    aes(ymin = pred_low, ymax = pred_high),
    alpha = 0.2, fill = "orange"
  ) +
  ggtitle("Predicted vs Actual Streams with Confidence and Prediction Intervals") +
  xlab("Actual Streams") +
  ylab("Predicted Streams") +
  theme_minimal()
# Same interval computation as before, this time with the rows sorted and the
# axes swapped (predicted on x, actual on y).

# Linear model wrapped by the caret train object.
lm_model <- cv_model$finalModel

# Fitted values with confidence bounds and with prediction bounds.
pred_conf <- predict(lm_model, newdata = data_complete, interval = "confidence")
pred_pred <- predict(lm_model, newdata = data_complete, interval = "prediction")

# data_complete plus fit and interval bounds, rows ordered by actual streams.
plot_data <- arrange(
  mutate(
    data_complete,
    predicted_streams = pred_conf[, "fit"],
    conf_low = pred_conf[, "lwr"],
    conf_high = pred_conf[, "upr"],
    pred_low = pred_pred[, "lwr"],
    pred_high = pred_pred[, "upr"]
  ),
  streams
)

# Actual (y) against predicted (x) streams with the identity line; the two
# ribbons span the confidence and prediction bounds at each predicted value.
ggplot(data = plot_data, mapping = aes(x = predicted_streams, y = streams)) +
  geom_point(color = "darkblue", alpha = 0.6) +
  geom_abline(slope = 1, intercept = 0, color = "blue", linetype = "dashed") +
  geom_ribbon(
    aes(ymin = conf_low, ymax = conf_high),
    alpha = 0.3, fill = "lightblue"
  ) +
  geom_ribbon(
    aes(ymin = pred_low, ymax = pred_high),
    alpha = 0.2, fill = "orange"
  ) +
  ggtitle("Predicted vs Actual Streams with Confidence and Prediction Intervals") +
  xlab("Predicted Streams") +
  ylab("Actual Streams") +
  theme_minimal()
# Same view as the ribbon plot, but the interval bounds are drawn as lines:
# purple dashed = confidence bounds, red dotted = prediction bounds,
# solid blue = identity line.
ggplot(data = plot_data, mapping = aes(x = predicted_streams, y = streams)) +
  geom_point(color = "darkblue", alpha = 0.6) +
  geom_abline(slope = 1, intercept = 0, color = "blue", linetype = "solid") +
  geom_line(aes(y = conf_low), linetype = "dashed", color = "purple") +
  geom_line(aes(y = conf_high), linetype = "dashed", color = "purple") +
  geom_line(aes(y = pred_low), linetype = "dotted", color = "red") +
  geom_line(aes(y = pred_high), linetype = "dotted", color = "red") +
  ggtitle("Actual vs Predicted Streams with Confidence and Prediction Interval Lines") +
  xlab("Predicted Streams") +
  ylab("Actual Streams") +
  theme_minimal()
# Actual streams (points), fitted values (blue line) and confidence bounds
# (grey ribbon) plotted against the Spotify playlist count.
# NOTE(review): the fitted values come from a model with three predictors, so
# the line and ribbon are not a smooth function of this one predictor.
ggplot(data = plot_data, mapping = aes(x = in_spotify_playlists, y = streams)) +
  geom_point() +
  geom_line(mapping = aes(y = predicted_streams), color = "blue") +
  geom_ribbon(mapping = aes(ymin = conf_low, ymax = conf_high), alpha = 0.2) +
  ggtitle("Model Fit with Confidence Interval") +
  xlab("Spotify Playlists") +
  ylab("Streams") +
  theme_minimal()
# Load required packages
library(shiny)

# UI with two tabs:
#   "Visualize by Mode" - checkbox filter on `mode` plus a scatter plot;
#   "Predict Streams"   - numeric inputs, a button, and the predicted value.
ui <- fluidPage(
  titlePanel("Predict Spotify Song Streams"),
  tabsetPanel(
    tabPanel(
      "Visualize by Mode",
      sidebarLayout(
        sidebarPanel(
          # All modes present in the data are offered and pre-selected.
          checkboxGroupInput(
            "selected_modes", "Select Mode(s):",
            choices = unique(spotify$mode),
            selected = unique(spotify$mode)
          )
        ),
        mainPanel(
          plotOutput("modePlot")
        )
      )
    ),
    tabPanel(
      "Predict Streams",
      sidebarLayout(
        sidebarPanel(
          numericInput("spotify", "Spotify Playlists:", value = 5000, min = 0),
          numericInput("deezer", "Deezer Playlists:", value = 1000, min = 0),
          numericInput("apple", "Apple Playlists:", value = 2000, min = 0),
          actionButton("predict", "Predict Streams")
        ),
        mainPanel(
          h3("Predicted Streams:"),
          verbatimTextOutput("prediction")
        )
      )
    )
  )
)

# Server logic. Uses `final_model` and `spotify` from the notebook's global
# environment.
server <- function(input, output) {
  # Prediction recomputed only when the button is pressed. Outputs are
  # defined once at the top level of the server function rather than being
  # re-assigned inside an observer on every click.
  predicted <- eventReactive(input$predict, {
    # An emptied numeric box yields NA; stop quietly instead of passing NA
    # to predict(). A value of 0 is still accepted.
    req(input$spotify, input$deezer, input$apple)

    new_input <- data.frame(
      in_spotify_playlists = input$spotify,
      in_deezer_playlists = input$deezer,
      in_apple_playlists = input$apple
    )
    predict(final_model, newdata = new_input)
  })

  # Rounded to whole streams and printed with thousands separators.
  output$prediction <- renderText({
    format(round(predicted(), 0), big.mark = ",")
  })

  output$modePlot <- renderPlot({
    # Nothing to draw until at least one mode is ticked.
    req(input$selected_modes)
    filtered_data <- subset(spotify, mode %in% input$selected_modes)

    # NOTE(review): the str() output at the top of the notebook shows
    # spotify$streams as character. Coerce it so the x axis is continuous;
    # values that are not numbers become NA and those rows are dropped.
    # If the column is already numeric this is a no-op.
    filtered_data$streams <- suppressWarnings(as.numeric(filtered_data$streams))
    filtered_data <- filtered_data[!is.na(filtered_data$streams), ]

    ggplot(
      filtered_data,
      aes(x = streams, y = in_spotify_playlists, color = mode)
    ) +
      geom_point() +
      labs(
        title = "Streams vs Playlist Metrics by Mode",
        x = "Streams",
        y = "Number in Spotify Playlists"
      ) +
      theme_minimal()
  })
}

# Run the application
shinyApp(ui = ui, server = server)
Listening on http://127.0.0.1:7921
# Prediction plot with confidence band.
# The plot-level y aesthetic used to be in_spotify_playlists, so geom_line()
# drew playlist counts on an axis whose points and ribbon are stream counts,
# and the axis was labelled with the wrong variable. Everything on the y axis
# is now in streams: actual values as points, fitted values as the line, and
# the confidence bounds as the ribbon.
ggplot(plot_data, aes(x = predicted_streams, y = streams)) +
  geom_point(alpha = 0.5) +
  geom_line(aes(y = predicted_streams)) +
  geom_ribbon(aes(ymin = conf_low, ymax = conf_high), alpha = 0.2) +
  labs(
    title = "Prediction with Confidence Intervals",
    x = "Predicted Streams",
    y = "Streams"
  )
# Default diagnostic plots for the linear model, then plot number 5 on its
# own (residuals vs leverage, per the plot.lm documentation).
plot(lm_model)
plot(lm_model, which = 5)

# Cook's distance for every observation used in the fit.
cooksD <- cooks.distance(lm_model)

# Rule-of-thumb cut-off: 4 divided by the number of rows in the data.
threshold <- 4 / nrow(data_complete)

# Positions of the observations whose Cook's distance exceeds the cut-off.
influential_points <- which(threshold < cooksD)
influential_points
X15 X16 X23 X42 X43 X48 X52 X58 X62 X72 X75 X85 X88 X111 X116 X122 X133 X147 X153 X155 X159 X160 X165 X167 X168 X170 X171 X181 X185 X187
15 16 23 42 43 48 51 55 59 68 70 78 80 98 102 108 116 127 132 134 137 138 142 144 145 146 147 152 155 157
X188 X193 X232 X240 X304 X321 X366 X367 X369 X379 X393 X396 X411 X423 X425 X426 X434 X444 X455 X458 X461 X463 X467 X470 X472 X496 X506 X508 X511 X514
158 161 199 207 267 284 325 326 328 338 352 355 367 379 381 382 390 397 408 411 414 416 420 423 425 449 459 461 464 467
X520 X531 X536 X556 X558 X563 X566 X567 X576 X582 X585 X592 X599 X600 X614 X617 X620 X622 X639 X658 X675 X719 X720 X740 X765 X841 X857 X864 X900 X903
473 484 489 509 511 516 519 520 528 534 537 544 550 551 564 566 569 570 583 598 610 650 651 667 688 764 780 787 821 824
X912
832
# Create a new dataset excluding influential rows.
# Negative indexing is unsafe here: if no point exceeds the threshold,
# `influential_points` is empty and data_complete[-integer(0), ] selects zero
# rows instead of all of them. Computing the rows to keep handles the empty
# case and gives the same result as before when points are flagged.
rows_to_keep <- setdiff(seq_len(nrow(data_complete)), influential_points)
data_no_influential <- data_complete[rows_to_keep, ]
data_no_influential
# Refit the model using caret with 10-fold cross-validation on the data
# without the influential rows.
# The seed is set first so the fold assignment, and therefore the reported
# resampling metrics, are reproducible, as they are for cv_model.
set.seed(123)
cv_model_clean <- train(
  model_formula,
  data = data_no_influential,
  method = "lm",
  trControl = trainControl(method = "cv", number = 10)
)

# Resampling metrics side by side.
# NOTE(review): the second model is scored on data from which the flagged
# rows were removed, so the two sets of metrics are not measured on the same
# observations.
cv_model$results # Original model
cv_model_clean$results # Model without influential points
# Refit both data sets with plain lm() using the same formula, then tabulate
# the AIC of the two fits.
# NOTE(review): the fits use different numbers of observations, which is what
# the warning R prints for this call is about. Information criteria are only
# comparable between models fitted to the same data, so this table should not
# be used to rank the two fits.
model_full <- lm(model_formula, data = data_complete)
model_reduced <- lm(model_formula, data = data_no_influential)
AIC(model_full, model_reduced)
Warning in AIC.default(model_full, model_reduced) :
models are not all fitted to the same number of observations
# BIC of the two lm fits.
# NOTE(review): model_full and model_reduced are fitted to different numbers
# of observations (R warns about this), so the two values are not comparable.
BIC(model_full, model_reduced)
Warning in BIC.default(model_full, model_reduced) :
models are not all fitted to the same number of observations
# Interval predictions for the model refitted without influential rows.

# Linear model wrapped by the caret train object.
lm_model_clean <- cv_model_clean[["finalModel"]]

# Fitted values for the cleaned data, with confidence bounds and with
# prediction bounds (columns "fit", "lwr", "upr" are used below).
pred_conf_clean <- predict(
  lm_model_clean,
  newdata = data_no_influential,
  interval = "confidence"
)
pred_pred_clean <- predict(
  lm_model_clean,
  newdata = data_no_influential,
  interval = "prediction"
)

# Cleaned data plus fit and interval bounds, rows ordered by actual streams.
plot_data_clean <- arrange(
  mutate(
    data_no_influential,
    predicted_streams = pred_conf_clean[, "fit"],
    conf_low = pred_conf_clean[, "lwr"],
    conf_high = pred_conf_clean[, "upr"],
    pred_low = pred_pred_clean[, "lwr"],
    pred_high = pred_pred_clean[, "upr"]
  ),
  streams
)
# Actual (y) against predicted (x) streams for the cleaned data. Interval
# bounds are drawn as lines: purple dashed = confidence bounds, red dotted =
# prediction bounds, solid blue = identity line.
ggplot(
  data = plot_data_clean,
  mapping = aes(x = predicted_streams, y = streams)
) +
  geom_point(color = "darkblue", alpha = 0.6) +
  geom_abline(slope = 1, intercept = 0, color = "blue", linetype = "solid") +
  geom_line(aes(y = conf_low), linetype = "dashed", color = "purple") +
  geom_line(aes(y = conf_high), linetype = "dashed", color = "purple") +
  geom_line(aes(y = pred_low), linetype = "dotted", color = "red") +
  geom_line(aes(y = pred_high), linetype = "dotted", color = "red") +
  ggtitle("Actual vs Predicted Streams with Confidence and Prediction Interval Lines") +
  xlab("Predicted Streams") +
  ylab("Actual Streams") +
  theme_minimal()